In [1]:
%cd /mnt/Data/Jupyter/VisCA1
/mnt/Data/Jupyter/VisCA1
In [2]:
import pandas as pd
In [3]:
import warnings
warnings.filterwarnings("ignore")
%load_ext rpy2.ipython
In [4]:
%%R
require(tidyverse)
require(ggplot2)
require(plotluck)
require(reshape2)
require(cowplot)
theme_set(theme_minimal())

Exercise 1

Using Entity Type, create a tree map showing the sum of total-lobbying by type of government.

• What did you find out?

• What other questions would you want to answer?

• What calculations could you create that would inform your reporting? Identify up to three calculations (exploring Create Calculations feature).

In [5]:
lobbying = pd.read_excel("lobbying_data.xlsx")
lobbying["Total-Lobbying"] = lobbying["Compensation"] + lobbying["Expenses"]
lobbyingTree = lobbying[["entity_type", "Entity-cleaned", "Total-Lobbying"]]
lobbyingTree.rename(columns={"entity_type":"Type", "Entity-cleaned":"Entity", "Total-Lobbying":"Lobbying"}, inplace=True)
lobbyingTree.head()
Out[5]:
Type Entity Lobbying
0 CITIES Algona 18000.0
1 CITIES Arlington 13687.5
2 CITIES Auburn 1900.0
3 CITIES Battle Ground 12000.0
4 CITIES Bellevue 19000.0
In [6]:
%%R -i lobbyingTree
require(data.tree)

lobbyingTree$pathString <- paste('LOBBYING_TREE', lobbyingTree$Type, lobbyingTree$Entity, sep = "/")
tree <- as.Node(lobbyingTree[,])
print(tree, pruneMethod='dist', limit=20)
                                                               levelName
1  LOBBYING_TREE                                                        
2   ¦--CITIES                                                           
3   ¦   ¦--Algona                                                       
4   ¦   ¦--Arlington                                                    
5   ¦   °--... 48 nodes w/ 0 sub                                        
6   ¦--COUNTIES                                                         
7   ¦   ¦--Asotin County                                                
8   ¦   ¦--Clark County                                                 
9   ¦   °--... 16 nodes w/ 0 sub                                        
10  ¦--OTHER                                                            
11  ¦   ¦--Association Of Washington Cities                             
12  ¦   ¦--Association Of Washington State Public Facilities Districts  
13  ¦   °--... 18 nodes w/ 0 sub                                        
14  ¦--PORTS                                                            
15  ¦   ¦--Port Of Bellingham                                           
16  ¦   ¦--Port Of Bremerton                                            
17  ¦   °--... 9 nodes w/ 0 sub                                         
18  ¦--PUBLIC FACILITIES DISTRICTS                                      
19  ¦   ¦--Spokane Public Facilities District                           
20  ¦   °--Washington State Convention Center Public Facilities District
21  ¦--SCHOOL DISTRICTS                                                 
22  ¦   ¦--Seattle Public Schools                                       
23  ¦   °--... 3 nodes w/ 0 sub                                         
24  ¦--TRIBES                                                           
25  ¦   °--... 17 nodes w/ 0 sub                                        
26  °--UTILITY DISTRICTS                                                
27      °--... 9 nodes w/ 0 sub                                         
In [7]:
%%R
tree$Do(function(x) x$aggLobbying <- Aggregate(x, 'Lobbying', sum))
tree$Sort(attribute='Lobbying', decreasing=TRUE, recursive=TRUE)
print(tree, 'Lobbying', 'aggLobbying', pruneMethod='dist', limit=14)
                                      levelName  Lobbying aggLobbying
1  LOBBYING_TREE                                       NA  2523089.92
2   ¦--CITIES                                          NA   870602.31
3   ¦   ¦--Seattle                              138092.00   138092.00
4   ¦   ¦--Tacoma                                74100.00    74100.00
5   ¦   °--... 48 nodes w/ 0 sub                       NA          NA
6   ¦--COUNTIES                                        NA   321889.00
7   ¦   ¦--King County                           96654.00    96654.00
8   ¦   ¦--Pierce County                         67802.00    67802.00
9   ¦   °--... 16 nodes w/ 0 sub                       NA          NA
10  ¦--OTHER                                           NA   408145.84
11  ¦   ¦--Washington Indian Gaming Association  79319.35    79319.35
12  ¦   °--... 19 nodes w/ 0 sub                       NA          NA
13  ¦--PORTS                                           NA   218413.61
14  ¦   °--... 11 nodes w/ 0 sub                       NA          NA
15  ¦--PUBLIC FACILITIES DISTRICTS                     NA    24073.45
16  ¦   °--... 2 nodes w/ 0 sub                        NA          NA
17  ¦--SCHOOL DISTRICTS                                NA    60500.00
18  ¦   °--... 4 nodes w/ 0 sub                        NA          NA
19  ¦--TRIBES                                          NA   490778.71
20  ¦   °--... 17 nodes w/ 0 sub                       NA          NA
21  °--UTILITY DISTRICTS                               NA   128687.00
22      °--... 9 nodes w/ 0 sub                        NA          NA
In [8]:
%%R -w 8 -h 5 --units in -r 150
require(treemap)

treemap(ToDataFrameNetwork(tree, 'Lobbying', direction='climb')[9:139,], 
        index=c('from', 'to'), vSize='Lobbying', vColor='Lobbying', type='value', palette=heat.colors(-55))
In [9]:
%%R -w 12 -h 12 --units in -r 100
require(igraph)

treeGraph <- as.igraph(tree, 'aggLobbying', directed=F, direction='climb')
vertex_attr(treeGraph, 'aggLobbying')[1] <- 0

plot(treeGraph, vertex.color='lightblue', vertex.label.family='Segoe UI', 
     vertex.label.cex=0.55, vertex.label.color='black', layout=layout.kamada.kawai,
     vertex.size=vertex_attr(treeGraph, 'aggLobbying') / (max(vertex_attr(treeGraph, 'aggLobbying')) * 0.035))
In [10]:
longlat = pd.read_csv("zip_codes_states.csv")
longlat = longlat[longlat["state"] == "WA"]
longlat.drop_duplicates(subset="city", inplace=True)
longlat.set_index("city", inplace=True)
longlat.head()
Out[10]:
zip_code latitude longitude state county
city
Auburn 98001 47.465495 -121.821908 WA King
Federal Way 98003 47.432251 -121.803388 WA King
Bellevue 98004 47.615471 -122.207221 WA King
Black Diamond 98010 47.310568 -121.998721 WA King
Bothell 98011 47.750689 -122.214376 WA King
In [11]:
lobbyingGeo = lobbying[lobbying["entity_type"] == "CITIES"].join(longlat, how="left", on="City")[
    ["Total-Lobbying", "City", "county", "longitude", "latitude"]]
lobbyingGeo.rename(columns={"Total-Lobbying":"lobbying", "City":"city"}, inplace=True)
lobbyingGeo.drop(lobbyingGeo.index[44], inplace=True) # duplicate row
lobbyingGeo[lobbyingGeo.isnull().any(1)]
Out[11]:
lobbying city county longitude latitude
0 18000.00 Algona NaN NaN NaN
6 18000.00 Burien NaN NaN NaN
8 7166.68 Covington NaN NaN NaN
12 12061.47 Fife NaN NaN NaN
38 7000.00 Seatac NaN NaN NaN
41 25900.00 Shoreline NaN NaN NaN
In [12]:
lobbyingGeo.loc[0, "county"], lobbyingGeo.loc[0, "latitude"], lobbyingGeo.loc[0, "longitude"] = "King", 47.281954, -122.250388
lobbyingGeo.loc[6, "county"], lobbyingGeo.loc[6, "latitude"], lobbyingGeo.loc[6, "longitude"] = "King", 47.46917, -122.364291
lobbyingGeo.loc[8, "county"], lobbyingGeo.loc[8, "latitude"], lobbyingGeo.loc[8, "longitude"] = "King", 47.364791, -122.104563
lobbyingGeo.loc[12, "county"], lobbyingGeo.loc[12, "latitude"], lobbyingGeo.loc[12, "longitude"] = "Pierce", 47.23206, -122.351726
lobbyingGeo.loc[38, "county"], lobbyingGeo.loc[38, "latitude"], lobbyingGeo.loc[38, "longitude"] = "King", 47.44333, -122.298767
lobbyingGeo.loc[41, "county"], lobbyingGeo.loc[41, "latitude"], lobbyingGeo.loc[41, "longitude"] = "King", 47.756904, -122.342414

lobbyingGeo.head()

# data retrieved from http://citylatitudelongitude.com/
Out[12]:
lobbying city county longitude latitude
0 18000.0 Algona King -122.250388 47.281954
1 13687.5 Arlington Snohomish -121.959469 48.181498
2 1900.0 Auburn King -121.821908 47.465495
3 12000.0 Battle Ground Clark -122.531645 45.803592
4 19000.0 Bellevue King -122.207221 47.615471
In [13]:
duplicates = len(lobbyingGeo.drop_duplicates(subset=["longitude", "latitude"])) != len(lobbyingGeo)
print("Any Duplicate coordinates present?:", duplicates)
Any Duplicate coordinates present?: True
In [14]:
lobbyingGeo[lobbyingGeo["latitude"] == 
            float(lobbyingGeo[~lobbyingGeo.isin(lobbyingGeo.drop_duplicates(
                subset=["longitude", "latitude"]))].dropna()["latitude"])]
Out[14]:
lobbying city county longitude latitude
11 22490.0 Federal Way King -121.803388 47.432251
39 138092.0 Seattle King -121.803388 47.432251
In [15]:
lobbyingGeo.loc[11, "latitude"], lobbyingGeo.loc[41, "longitude"] = 47.322323, -122.312622

# sort out seattle location
In [16]:
print("Any Duplicate coordinates present?:", len(lobbyingGeo.drop_duplicates(subset=["longitude", "latitude"])) != len(lobbyingGeo))
Any Duplicate coordinates present?: False
In [17]:
%%R -i lobbyingGeo -w 6 -h 4 --units in -r 200
require(ggmap)

lat <- c(min(lobbyingGeo$latitude), max(lobbyingGeo$latitude))
lon <- c(min(lobbyingGeo$longitude), max(lobbyingGeo$longitude))

plot <- get_map(location=c(mean(lon), mean(lat)), zoom=6, source='google', maptype='terrain')
ggmap(plot, extent='device') + 
  geom_density2d(data=lobbyingGeo, aes(x=longitude, y=latitude), size=0.1, color='black') + 
  stat_density2d(data=lobbyingGeo, 
                 aes(x=longitude, y=latitude, fill=..level.., alpha=..level..), size=0.01, 
                 bins=16, geom='polygon') + 
  scale_fill_gradient(low='green', high='red', guide=FALSE) + 
  scale_alpha(range=c(0, 0.3), guide=FALSE) +
  scale_x_continuous(limits=c(min(lon), max(lon)), expand=c(0.4, 0)) +
  scale_y_continuous(limits=c(min(lat), max(lat)), expand=c(0.4, 0))

# talk about using average measures to set map location
In [18]:
%%R -i lobbyingGeo -w 5 -h 5 --units in -r 200
plot <- get_map(location=c(median(lobbyingGeo$longitude), median(lobbyingGeo$latitude)), 
                zoom=9, source='google', maptype='roadmap')
ggmap(plot, extent='device') + 
  geom_point(aes(x=longitude, y=latitude), data=lobbyingGeo, col="darkred", alpha=0.4, 
              size=lobbyingGeo$lobbying/max(lobbyingGeo$lobbying)*15) +
  scale_alpha(range = c(0, 0.3), guide = FALSE)
In [19]:
# import matplotlib.pyplot as plt
# plt.rcParams['figure.figsize'] = (20,10)

# refinery.plot()
In [20]:
%%R -o all_states
# require(maps)

all_states <- map_data("state")
In [21]:
refinery = pd.read_excel("19. Data Set - Module 6 - accidents-state.xlsx")[:-1]
refinery.head()
Out[21]:
State # of RMP facilities # of accidents # of deaths # of injuries # evacuated Property damage (dollars)
0 Alabama 196 31 0 23 59 182682
1 Alaska 43 7 0 1 20 103
2 Arizona 110 7 0 3 35 200000
3 Arkansas 165 47 0 63 1327 52232813
4 California 867 76 0 14076 65426 8531448
In [22]:
refinery["State"] = refinery["State"].str.lower()
In [23]:
nulls = all_states.join(refinery.set_index("State"), how="right", on="region")
nulls[nulls.isnull().any(1) == True]

# note on how guam, hawaii and alaska are low so will be ignored.  
# puerto rico and the virgin islands very high so will be ignored and explored for further interest.
# all will be left in the frame in order to facilitate any potenential calculations.
Out[23]:
long lat group order region subregion # of RMP facilities # of accidents # of deaths # of injuries # evacuated Property damage (dollars)
15599 NaN NaN NaN NaN alaska NaN 43 7 0 1 20 103
15599 NaN NaN NaN NaN guam NaN 4 0 0 0 0 0
15599 NaN NaN NaN NaN hawaii NaN 16 4 0 1 600 500
15599 NaN NaN NaN NaN puerto rico NaN 66 3 0 71 2000 500000
15599 NaN NaN NaN NaN virgin islands of the u.s. NaN 1 4 0 16 1300 8540000
In [24]:
refinery.rename(columns={"# of RMP facilities":"facility_count", "# of accidents":"accidents", 
                            "# of deaths":"deaths", "# of injuries":"injuries", 
                            "# evacuated":"evacuated", "Property damage (dollars)":"property_damage"}, inplace=True)
refineryGeo = all_states.join(refinery.set_index("State"), how="right", on="region")
refineryGeo.drop(columns="subregion", inplace=True)
refineryGeo.head()
Out[24]:
long lat group order region facility_count accidents deaths injuries evacuated property_damage
1 -87.462006 30.389681 1.0 1.0 alabama 196 31 0 23 59 182682
2 -87.484932 30.372492 1.0 2.0 alabama 196 31 0 23 59 182682
3 -87.525032 30.372492 1.0 3.0 alabama 196 31 0 23 59 182682
4 -87.530762 30.332386 1.0 4.0 alabama 196 31 0 23 59 182682
5 -87.570869 30.326654 1.0 5.0 alabama 196 31 0 23 59 182682
In [25]:
%%R -i refineryGeo -w 7 -h 3 --units in -r 180

lat <- c(min(all_states$lat), max(all_states$lat))
lon <- c(min(all_states$lon), max(all_states$lon))

plot <- get_map(location=c(mean(lon), mean(lat)), zoom=4, source='stamen', maptype='toner')

facility_count <- ggmap(plot, extent='device') +
  scale_x_continuous(limits=c(min(lon), max(lon)), expand = c(0, 0)) +
  scale_y_continuous(limits=c(min(lat), max(lat)), expand = c(0, 0)) + 
  geom_polygon(data=refineryGeo, aes(x=long, y=lat, group=group, fill=facility_count), colour='black', size=0.1, alpha=0.4) +
  scale_fill_continuous(low='thistle2', high='darkblue', guide=FALSE)

damage <- ggmap(plot, extent='device') +
  scale_x_continuous(limits=c(min(lon), max(lon)), expand = c(0, 0)) +
  scale_y_continuous(limits=c(min(lat), max(lat)), expand = c(0, 0)) + 
  geom_polygon(data=refineryGeo, aes(x=long, y=lat, group=group, fill=property_damage), colour='black', size=0.1, alpha=0.4) +
  scale_fill_continuous(low='thistle2', high='chocolate', guide=FALSE)

injuries <- ggmap(plot, extent='device') +
  scale_x_continuous(limits=c(min(lon), max(lon)), expand = c(0, 0)) +
  scale_y_continuous(limits=c(min(lat), max(lat)), expand = c(0, 0)) + 
  geom_polygon(data=refineryGeo, aes(x=long, y=lat, group=group, fill=injuries), colour='black', size=0.1, alpha=0.4) +
  scale_fill_continuous(low='thistle2', high='darkred', guide=FALSE)

accidents <- ggmap(plot, extent='device') +
  scale_x_continuous(limits=c(min(lon), max(lon)), expand = c(0, 0)) +
  scale_y_continuous(limits=c(min(lat), max(lat)), expand = c(0, 0)) + 
  geom_polygon(data=refineryGeo, aes(x=long, y=lat, group=group, fill=accidents), colour='black', size=0.1, alpha=0.4) +
  scale_fill_continuous(low='thistle2', high='darkblue', guide=FALSE)

evacuated <- ggmap(plot, extent='device') +
  scale_x_continuous(limits=c(min(lon), max(lon)), expand = c(0, 0)) +
  scale_y_continuous(limits=c(min(lat), max(lat)), expand = c(0, 0)) + 
  geom_polygon(data=refineryGeo, aes(x=long, y=lat, group=group, fill=evacuated), colour='black', size=0.1, alpha=0.4) +
  scale_fill_continuous(low='thistle2', high='chocolate', guide=FALSE)

deaths <- ggmap(plot, extent='device') +
  scale_x_continuous(limits=c(min(lon), max(lon)), expand = c(0, 0)) +
  scale_y_continuous(limits=c(min(lat), max(lat)), expand = c(0, 0)) + 
  geom_polygon(data=refineryGeo, aes(x=long, y=lat, group=group, fill=deaths), colour='black', size=0.1, alpha=0.4) +
  scale_fill_continuous(low='thistle2', high='darkred', guide=FALSE)

plot_grid(facility_count, damage, deaths, accidents, evacuated, injuries, nrow=2, ncol=3, 
          labels=c('Facilities', 'Damage', 'Deaths', 'Accidents', 'Evacuated', 'Injuries'))
In [26]:
refinery = refinery.set_index("State").drop(index=[region for region in nulls[nulls.isnull().any(1) == True].region]).reset_index()
refinery["scaled_accidents"] = pd.DataFrame([row["accidents"] / row["facility_count"] for index, row in refinery.iterrows()])
refinery["scaled_damage"] = pd.DataFrame([row["property_damage"] / row["facility_count"] for index, row in refinery.iterrows()])
refinery["scaled_evacuated"] = pd.DataFrame([row["evacuated"] / row["facility_count"] for index, row in refinery.iterrows()])
refinery["scaled_deaths"] = pd.DataFrame([row["deaths"] / row["facility_count"] for index, row in refinery.iterrows()])
refinery["scaled_injuries"] = pd.DataFrame([row["injuries"] / row["facility_count"] for index, row in refinery.iterrows()])
refinery

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler() 

refinery_scaled = pd.concat([refinery.iloc[:,[0, 1, 2, 3, 4, 5, 6]], 
                                pd.DataFrame(scaler.fit_transform(refinery.iloc[:,[7, 8, 9, 10, 11]]), 
             columns=["scaled_accidents", "scaled_damage", "scaled_evacuated", "scaled_deaths", "scaled_injuries"])], axis=1)

refineryGeo_scaled = all_states.join(refinery_scaled.set_index("State"), how="right", on="region")
refineryGeo_scaled.drop(columns="subregion", inplace=True)
refineryGeo_scaled.head()
Out[26]:
long lat group order region facility_count accidents deaths injuries evacuated property_damage scaled_accidents scaled_damage scaled_evacuated scaled_deaths scaled_injuries
1 -87.462006 30.389681 1.0 1 alabama 196 31 0 23 59 182682 0.158163 0.000785 0.003989 0.0 0.007228
2 -87.484932 30.372492 1.0 2 alabama 196 31 0 23 59 182682 0.158163 0.000785 0.003989 0.0 0.007228
3 -87.525032 30.372492 1.0 3 alabama 196 31 0 23 59 182682 0.158163 0.000785 0.003989 0.0 0.007228
4 -87.530762 30.332386 1.0 4 alabama 196 31 0 23 59 182682 0.158163 0.000785 0.003989 0.0 0.007228
5 -87.570869 30.326654 1.0 5 alabama 196 31 0 23 59 182682 0.158163 0.000785 0.003989 0.0 0.007228
In [27]:
%%R -i refineryGeo_scaled -w 7 -h 3 --units in -r 180

lat <- c(min(all_states$lat), max(all_states$lat))
lon <- c(min(all_states$lon), max(all_states$lon))

plot <- get_map(location=c(mean(lon), mean(lat)), zoom=4, source='stamen', maptype='toner')

facility_count <- ggmap(plot, extent='device') +
  scale_x_continuous(limits=c(min(lon), max(lon)), expand = c(0, 0)) +
  scale_y_continuous(limits=c(min(lat), max(lat)), expand = c(0, 0)) + 
  geom_polygon(data=refineryGeo_scaled, aes(x=long, y=lat, group=group, fill=facility_count), colour='black', size=0.1, alpha=0.4) +
  scale_fill_continuous(low='thistle2', high='darkblue', guide=FALSE)

damage <- ggmap(plot, extent='device') +
  scale_x_continuous(limits=c(min(lon), max(lon)), expand = c(0, 0)) +
  scale_y_continuous(limits=c(min(lat), max(lat)), expand = c(0, 0)) + 
  geom_polygon(data=refineryGeo_scaled, aes(x=long, y=lat, group=group, fill=scaled_damage), colour='black', size=0.1, alpha=0.4) +
  lims(fill=c(0, 1)) +
  scale_fill_continuous(low='thistle2', high='chocolate') +
  theme(legend.position='none')

injuries <- ggmap(plot, extent='device') +
  scale_x_continuous(limits=c(min(lon), max(lon)), expand = c(0, 0)) +
  scale_y_continuous(limits=c(min(lat), max(lat)), expand = c(0, 0)) + 
  geom_polygon(data=refineryGeo_scaled, aes(x=long, y=lat, group=group, fill=scaled_injuries), colour='black', size=0.1, alpha=0.4) +
  lims(fill=c(0, 1)) +
  scale_fill_continuous(low='thistle2', high='darkred') +
  theme(legend.position='none')

accidents <- ggmap(plot, extent='device') +
  scale_x_continuous(limits=c(min(lon), max(lon)), expand = c(0, 0)) +
  scale_y_continuous(limits=c(min(lat), max(lat)), expand = c(0, 0)) + 
  geom_polygon(data=refineryGeo_scaled, aes(x=long, y=lat, group=group, fill=scaled_accidents), colour='black', size=0.1, alpha=0.4) +
  lims(fill=c(0, 1)) +
  scale_fill_continuous(low='thistle2', high='darkblue') +
  theme(legend.position='none')

evacuated <- ggmap(plot, extent='device') +
  scale_x_continuous(limits=c(min(lon), max(lon)), expand = c(0, 0)) +
  scale_y_continuous(limits=c(min(lat), max(lat)), expand = c(0, 0)) + 
  geom_polygon(data=refineryGeo_scaled, aes(x=long, y=lat, group=group, fill=scaled_evacuated), colour='black', size=0.1, alpha=0.4) +
  lims(fill=c(0, 1)) +
  scale_fill_continuous(low='thistle2', high='chocolate') +
  theme(legend.position='none')

deaths <- ggmap(plot, extent='device') +
  scale_x_continuous(limits=c(min(lon), max(lon)), expand = c(0, 0)) +
  scale_y_continuous(limits=c(min(lat), max(lat)), expand = c(0, 0)) + 
  geom_polygon(data=refineryGeo_scaled, aes(x=long, y=lat, group=group, fill=scaled_deaths), colour='black', size=0.1, alpha=0.4) +
  lims(fill=c(0, 1)) +
  scale_fill_continuous(low='thistle2', high='darkred') +
  theme(legend.position='none')

plot_grid(facility_count, damage, deaths, accidents, evacuated, injuries, nrow=2, ncol=3, 
          labels=c('Facilities', 'Damage', 'Deaths', 'Accidents', 'Evacuated', 'Injuries'))
In [39]:
%%R -i refineryGeo_scaled -w 2 -h 3 --units in -r 180

lat <- c(min(all_states$lat), max(all_states$lat))
lon <- c(min(all_states$lon), max(all_states$lon))

plot <- get_map(location=c(mean(lon), mean(lat)), zoom=4, source='stamen', maptype='toner')

collateral <- ggmap(plot, extent='device') +
  scale_x_continuous(limits=c(min(lon), max(lon)), expand = c(0, 0)) +
  scale_y_continuous(limits=c(min(lat), max(lat)), expand = c(0, 0)) + 
  geom_polygon(data=refineryGeo_scaled, aes(x=long, y=lat, group=group, fill=(scaled_damage+scaled_evacuated)/2), colour='black', size=0.1, alpha=0.4) +
  lims(fill=c(0, 1)) +
  scale_fill_continuous(low='thistle2', high='chocolate') +
  theme(plot.title=element_text(size=10), legend.position='none') +
  ggtitle('Collateral Damage')   

human_cost <- ggmap(plot, extent='device') +
  scale_x_continuous(limits=c(min(lon), max(lon)), expand = c(0, 0)) +
  scale_y_continuous(limits=c(min(lat), max(lat)), expand = c(0, 0)) + 
  geom_polygon(data=refineryGeo_scaled, aes(x=long, y=lat, group=group, fill=(scaled_injuries+scaled_deaths)/2), colour='black', size=0.1, alpha=0.4) +
  lims(fill=c(0, 1)) +
  scale_fill_continuous(low='thistle2', high='darkred') +
  theme(plot.title=element_text(size=10), legend.position='none') +
  ggtitle('Human Cost')
 
plot_grid(collateral, human_cost, nrow=2, 
          labels=NULL)
In [37]:
%%R -i refineryGeo_scaled -w 7 -h 3 --units in -r 180

lat <- c(min(all_states$lat), max(all_states$lat))
lon <- c(min(all_states$lon), max(all_states$lon))

plot <- get_map(location=c(mean(lon), mean(lat)), zoom=4, source='stamen', maptype='toner')

collateral <- ggmap(plot, extent='device') +
  scale_x_continuous(limits=c(min(lon), max(lon)), expand = c(0, 0)) +
  scale_y_continuous(limits=c(min(lat), max(lat)), expand = c(0, 0)) + 
#   geom_polygon(data=refineryGeo_scaled, aes(x=long, y=lat, group=group, fill=(scaled_damage+scaled_evacuated)/2), colour='black', size=0.1, alpha=0.4) +
#   lims(fill=c(0, 1)) +
#   scale_fill_continuous(low='thistle2', high='chocolate') +
  theme(plot.title=element_text(size=10), legend.position='none') +
  ggtitle('Collateral Damage') +
  geom_density2d(data=refineryGeo_scaled, aes(x=long, y=lat), size=0.1, color='black') + 
  stat_density2d(data=refineryGeo_scaled, 
                 aes(x=long, y=lat, fill=..level.., alpha=..level..), size=0.01, 
                 bins=16, geom='polygon') + 
  scale_fill_gradient(low='green', high='red', guide=FALSE) + 
  scale_alpha(range=c(0, 0.3), guide=FALSE)

collateral
    
# human_cost <- ggmap(plot, extent='device') +
#   scale_x_continuous(limits=c(min(lon), max(lon)), expand = c(0, 0)) +
#   scale_y_continuous(limits=c(min(lat), max(lat)), expand = c(0, 0)) + 
#   geom_polygon(data=refineryGeo_scaled, aes(x=long, y=lat, group=group, fill=(scaled_injuries+scaled_deaths)/2), colour='black', size=0.1, alpha=0.4) +
#   lims(fill=c(0, 1)) +
#   scale_fill_continuous(low='thistle2', high='darkred') +
#   theme(plot.title=element_text(size=10), legend.position='none') +
#   ggtitle('Human Cost')
 
# plot_grid(collateral, human_cost, nrow=2, 
#           labels=NULL)
In [60]:
import bs4 as bs
import requests

def state_land_area():
    resp = requests.get("https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_area")
    soup = bs.BeautifulSoup(resp.text, "lxml")
    table = soup.find("table", {"class" : "wikitable sortable"})
    landmass = []
    for row in table.findAll("tr")[2:]:
        state = row.findAll("td")[6].text
        landmass.append(state)
    return landmass
In [26]:
refinery = refinery.set_index("State").drop(index=[region for region in nulls[nulls.isnull().any(1) == True].region]).reset_index()
refinery["land_scaled_accidents"] = pd.DataFrame([row["accidents"] / row["facility_count"] for index, row in refinery.iterrows()])
refinery["land_scaled_damage"] = pd.DataFrame([row["property_damage"] / row["facility_count"] for index, row in refinery.iterrows()])
refinery["land_scaled_evacuated"] = pd.DataFrame([row["evacuated"] / row["facility_count"] for index, row in refinery.iterrows()])
refinery["land_scaled_deaths"] = pd.DataFrame([row["deaths"] / row["facility_count"] for index, row in refinery.iterrows()])
refinery["land_scaled_injuries"] = pd.DataFrame([row["injuries"] / row["facility_count"] for index, row in refinery.iterrows()])
refinery

state_land_area()

scaler = MinMaxScaler() 
refinery_scaled = pd.concat([refinery.iloc[:,[0, 1, 2, 3, 4, 5, 6]], 
                                pd.DataFrame(scaler.fit_transform(refinery.iloc[:,[7, 8, 9, 10, 11]]), 
             columns=["scaled_accidents", "scaled_damage", "scaled_evacuated", "scaled_deaths", "scaled_injuries"])], axis=1)

refineryGeo_scaled = all_states.join(refinery_scaled.set_index("State"), how="right", on="region")
refineryGeo_scaled.drop(columns="subregion", inplace=True)
refineryGeo_scaled.head()
Out[26]:
long lat group order region facility_count accidents deaths injuries evacuated property_damage scaled_accidents scaled_damage scaled_evacuated scaled_deaths scaled_injuries
1 -87.462006 30.389681 1.0 1 alabama 196 31 0 23 59 182682 0.158163 0.000785 0.003989 0.0 0.007228
2 -87.484932 30.372492 1.0 2 alabama 196 31 0 23 59 182682 0.158163 0.000785 0.003989 0.0 0.007228
3 -87.525032 30.372492 1.0 3 alabama 196 31 0 23 59 182682 0.158163 0.000785 0.003989 0.0 0.007228
4 -87.530762 30.332386 1.0 4 alabama 196 31 0 23 59 182682 0.158163 0.000785 0.003989 0.0 0.007228
5 -87.570869 30.326654 1.0 5 alabama 196 31 0 23 59 182682 0.158163 0.000785 0.003989 0.0 0.007228